library(aplore3)
library(ggplot2)
library(dplyr)
library(plotly)
library(GGally)
library(ca)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)
library(sjPlot)
library(sjmisc)
library(caret)
library(MASS)
library(randomForest)
library(vcd)
library(FactoMineR)
library(pROC)
#??glow500
Data Format: A data.frame with 500 rows and 18 variables such as:
priorfrac - If the patient previously had a fracture
age - Age at
Enrollment (years)
weight - Weight at enrollment (Kilograms)
height - Height at enrollment (Centimeters)
bmi - Body Mass Index
(Kg/m^2)
premeno - Menopause before age 45 (1: No, 2: Yes)
momfrac - Mother had hip fracture (1: No, 2: Yes)
armassist - Arms
are needed to stand from a chair (1: No, 2: Yes)
smoke - Former or
current smoker (1: No, 2: Yes)
raterisk - Self-reported risk of
fracture (1: Less than others of the same age, 2: Same as others of the
same age, 3: Greater than others of the same age)
fracscore -
Fracture Risk Score (Composite Risk Score)
fracture - Any fracture
in first year (1: No, 2: Yes)
bonemed - Bone medications at
enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at
follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at
enrollment and follow-up (1: No, 2: Yes)
# Load the GLOW500 fracture data from the aplore3 package.
# Guard the rm() so a fresh session does not emit a
# "object 'glow_bonemed' not found" warning.
if (exists("glow_bonemed")) rm(glow_bonemed)
data("glow_bonemed", package = "aplore3")
head(glow_bonemed)
## sub_id site_id phy_id priorfrac age weight height bmi premeno momfrac
## 1 1 1 14 No 62 70.3 158 28.16055 No No
## 2 2 4 284 No 65 87.1 160 34.02344 No No
## 3 3 6 305 Yes 88 50.8 157 20.60936 No Yes
## 4 4 6 309 No 82 62.1 160 24.25781 No No
## 5 5 1 37 No 61 68.0 152 29.43213 No No
## 6 6 5 299 Yes 67 68.0 161 26.23356 No No
## armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1 No No Same 1 No No No No
## 2 No No Same 2 No No No No
## 3 Yes No Less 11 No No No No
## 4 No No Less 5 No No No No
## 5 No No Same 1 No No No No
## 6 No Yes Same 4 No No No No
Summary statistics for numeric variables
#Set fracscore to integer for summary statistics
# Summary statistics for the continuous variables.
# fracscore is stored as an integer here so it is treated as numeric.
glow_bonemed$fracscore <- as.integer(glow_bonemed$fracscore)
# summarise_each()/funs() are deprecated in current dplyr; use
# summarise(across(...)) with a named list of functions instead.
mysummary <- glow_bonemed %>%
  dplyr::select(age, weight, height, bmi, fracscore) %>%
  summarise(across(
    everything(),
    list(min      = min,
         q25      = ~ quantile(.x, 0.25),
         median   = median,
         q75      = ~ quantile(.x, 0.75),
         max      = max,
         mean     = mean,
         sd       = sd,
         variance = var),
    .names = "{.col}_{.fn}"
  ))
# Reshape to one row per variable.  gather()/spread() are superseded by
# pivot_longer()/pivot_wider(); as.data.frame() keeps the original
# data-frame print format.
clean.summary <- mysummary %>%
  pivot_longer(everything(),
               names_to = c("var", "stat"),
               names_sep = "_") %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  dplyr::select(var, min, max, mean, sd, variance) %>%
  as.data.frame()
print(clean.summary)
## var min max mean sd variance
## 1 age 55.00000 90.00000 68.56200 8.989537 80.811780
## 2 bmi 14.87637 49.08241 27.55303 5.973958 35.688178
## 3 fracscore 0.00000 11.00000 3.69800 2.495446 6.227251
## 4 height 134.00000 199.00000 161.36400 6.355493 40.392289
## 5 weight 39.90000 127.00000 71.82320 16.435992 270.141825
Summary statistics for categorical variables
# Frequency tables for every categorical variable.
glow_bonemed %>%
  dplyr::select(priorfrac, premeno, momfrac, armassist, smoke, raterisk,
                fracture, bonemed, bonemed_fu, bonetreat) %>%
  summary()
## priorfrac premeno momfrac armassist smoke raterisk fracture
## No :374 No :403 No :435 No :312 No :465 Less :167 No :375
## Yes:126 Yes: 97 Yes: 65 Yes:188 Yes: 35 Same :186 Yes:125
## Greater:147
## bonemed bonemed_fu bonetreat
## No :371 No :361 No :382
## Yes:129 Yes:139 Yes:118
##
No missing values
# Per-column count of missing values (all zero).
glow_bonemed %>% is.na() %>% colSums()
## sub_id site_id phy_id priorfrac age weight height
## 0 0 0 0 0 0 0
## bmi premeno momfrac armassist smoke raterisk fracscore
## 0 0 0 0 0 0 0
## fracture bonemed bonemed_fu bonetreat
## 0 0 0 0
# Total count of missing values across the whole data frame.
glow_bonemed %>% is.na() %>% sum()
## [1] 0
Age vs Weight: As weight increases the average age decreases
Age
vs Height: Weak correlation of as height increases age decreases
Age
vs BMI: As bmi increases the average age decreases
Age vs fracscore:
As age increases the average fracscore increases
Weight vs Height: As height increases the average weight
increases
Weight vs BMI: As bmi increases the average weight
increases
Weight vs fracscore: As fracscore increases the average
Weight decreases
Height vs BMI: As bmi increases the average height and variance stay
the same
Height vs fracscore: As fracscore increases the average
height stays the same though variance might decrease
BMI vs fracscore: As fracscore increases the average bmi
decreases
# Base scatterplot matrix of the continuous variables.
plot(glow_bonemed[, c(5:8, 14)])
# Colour by fracture through the plot data itself rather than reaching into
# the global data frame with `$` inside aes() — the fracture column (15) was
# missing from the subset, which is why `$` was used and why ggplot emitted
# the "aesthetics were dropped" grouping warnings.
ggplotly(
  ggpairs(glow_bonemed[, c(5:8, 14:15)],
          ggplot2::aes(color = fracture),
          columns = 1:5,
          lower = list(continuous = wrap("smooth", alpha = 0.5, size = 0.9)))
)
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Can only have one: highlight
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Can only have one: highlight
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Can only have one: highlight
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Can only have one: highlight
None of the following scatter plots shows strong groupings for the fracture/no-fracture categorical variable
# Jittered scatter plots of each continuous-variable pair, coloured by
# fracture outcome.
ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
  geom_jitter() +
  ggtitle("BMI vs Age")
ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
  geom_jitter() +
  ggtitle("Fracture Score vs BMI")
ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
  geom_jitter() +
  ggtitle("Age vs Fracture Score")
ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
  geom_jitter() +
  ggtitle("Height vs Weight")
Once again there doesn’t seem to be strong groupings of the fracture categorical variable
# Interactive 3-D scatter of age/height/bmi, coloured by fracture outcome.
fracture3dplot <- glow_bonemed %>%
  plot_ly(x = ~age,
          y = ~height,
          z = ~bmi,
          color = ~fracture,
          colors = c("#0C4B8E", "#BF382A")) %>%
  add_markers()
fracture3dplot
There are so few “yes” fracture results that the plot isn’t very useful
## `geom_smooth()` using formula = 'y ~ x'
The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers
# Boxplots of fracture score by smoking status and by fracture outcome.
ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")
ggplot(glow_bonemed, aes(x = fracture, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture score distribution for fracture groups")
# Jittered dot plot of fracture score vs treatment status.
# Fixed label typos: "Recieved" -> "Received", "bonetreatement" -> "bone treatment".
ggplot(glow_bonemed, aes(x = fracscore, y = bonetreat, color = fracture)) +
  geom_point(position = position_jitter(width = 0.2, height = 0.1)) +
  labs(x = "Fracture Score",
       y = "Received bone medication treatment at enrollment and at follow up",
       color = "Fracture") +
  ggtitle("Difference in Fracture Score vs bone treatment at enrollment and follow up")
Plot confirms there is a strong correlation between age/fracscore, bmi/weight
# Pearson correlation matrix of the continuous variables.
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
correlation_matrix <- cor(glow_bonemed[, corr_vars])
ggcorrplot(corr = correlation_matrix, lab = TRUE, lab_size = 2,
           colors = c("#6D9EC1", "white", "#E46726")) +
  labs(title = "Correlation Between Variables") +
  theme(plot.title = element_text(hjust = .5),
        plot.subtitle = element_text(hjust = .5))
# Mosaic plots of fracture against each categorical predictor, as a 2x4 grid.
# The eight near-identical mosaicplot() calls are collapsed into one loop over
# a named vector of plot titles; "Treatement" typos are fixed to "Treatment".
par(mfrow = c(2, 4))
mosaic_titles <- c(
  bonetreat  = "Bone Treatment vs Fracture",
  priorfrac  = "Prior Fracture vs Fracture",
  bonemed    = "Bonemed vs Fracture",
  bonemed_fu = "Bonemed_fu vs Fracture",
  smoke      = "Smoke vs Fracture",
  armassist  = "Armassist vs Fracture",
  momfrac    = "Mother Fracture vs Fracture",
  premeno    = "Premeno Treatment vs Fracture"
)
for (predictor in names(mosaic_titles)) {
  mosaicplot(table(glow_bonemed[[predictor]], glow_bonemed$fracture),
             color = TRUE,
             xlab = predictor,
             ylab = "Fracture",
             main = mosaic_titles[[predictor]])
}
# Chi-squared tests of association between fracture and each of the four
# treatment/history variables; all are significant at alpha = 0.05.
chisq.test(table(glow_bonemed$priorfrac, glow_bonemed$fracture))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(glow_bonemed$priorfrac, glow_bonemed$fracture)
## X-squared = 22.635, df = 1, p-value = 1.959e-06
# Bone medication at enrollment vs fracture.
chisq.test(table(glow_bonemed$bonemed, glow_bonemed$fracture))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(glow_bonemed$bonemed, glow_bonemed$fracture)
## X-squared = 9.7822, df = 1, p-value = 0.001762
# Bone medication at follow-up vs fracture.
chisq.test(table(glow_bonemed$bonemed_fu, glow_bonemed$fracture))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(glow_bonemed$bonemed_fu, glow_bonemed$fracture)
## X-squared = 16.743, df = 1, p-value = 4.279e-05
# Bone medication at both time points vs fracture.
chisq.test(table(glow_bonemed$bonetreat, glow_bonemed$fracture))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(glow_bonemed$bonetreat, glow_bonemed$fracture)
## X-squared = 5.9159, df = 1, p-value = 0.015
Perform chi-squared test on all categorical variables
# Convert fracscore to a factor so it enters the MCA as categorical, then run
# MCA over all categorical variables with fracture as a supplementary variable.
glow_bonemed$fracscore <- as.factor(glow_bonemed$fracscore)
# Select columns by name (was magic indices c(4, 9:18)); also fixed the
# "blow_" variable-name typo.
glow_bonemed_categoricals <- glow_bonemed[, c("priorfrac", "premeno", "momfrac",
                                              "armassist", "smoke", "raterisk",
                                              "fracscore", "fracture", "bonemed",
                                              "bonemed_fu", "bonetreat")]
multipleCorrespondenceAnalysis <- MCA(glow_bonemed_categoricals,
                                      quali.sup = "fracture", graph = FALSE)
multipleCorrespondenceAnalysis
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 500 individuals, described by 11 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$quali.sup" "results for the supplementary categorical variables"
## 12 "$quali.sup$coord" "coord. for the supplementary categories"
## 13 "$quali.sup$cos2" "cos2 for the supplementary categories"
## 14 "$quali.sup$v.test" "v-test for the supplementary categories"
## 15 "$call" "intermediate results"
## 16 "$call$marge.col" "weights of columns"
## 17 "$call$marge.li" "weights of rows"
All of the categorical variables seem to have significant P values with bonetreat, bonemed, and bonemed_fu all having R^2 above .83
# Describe which variables drive the first MCA dimension.
firstDimension <- dimdesc(multipleCorrespondenceAnalysis, axes = 1)
print(firstDimension)
## $`Dim 1`
##
## Link between the variable and the categorical variable (1-way anova)
## =============================================
## R2 p.value
## bonetreat 0.87245886 7.754159e-225
## bonemed 0.84726742 2.443029e-205
## bonemed_fu 0.83803940 5.422994e-199
## raterisk 0.19571783 3.118876e-24
## fracscore 0.19131230 2.223696e-17
## priorfrac 0.10838142 4.201009e-14
## fracture 0.04922945 5.400633e-07
## armassist 0.02280481 7.047091e-04
## premeno 0.01718799 3.315107e-03
## smoke 0.01441500 7.194873e-03
##
## Link between variable and the categories of the categorical variables
## ================================================================
## Estimate p.value
## bonetreat=bonetreat_Yes 0.61370510 7.754159e-225
## bonemed=bonemed_Yes 0.58693281 2.443029e-205
## bonemed_fu=bonemed_fu_Yes 0.57007390 5.422994e-199
## raterisk=Greater 0.32172629 2.159398e-19
## priorfrac=priorfrac_Yes 0.21155159 4.201009e-14
## fracscore=10 1.42877661 4.142645e-07
## fracture=fracture_Yes 0.14295580 5.400633e-07
## fracscore=9 0.47787785 1.527510e-04
## fracscore=8 0.21858281 3.120054e-04
## armassist=armassist_Yes 0.08697950 7.047091e-04
## fracscore=7 0.09452369 8.158244e-04
## premeno=premeno_No 0.09249838 3.315107e-03
## smoke=smoke_No 0.13128246 7.194873e-03
## fracscore=5 -0.05350173 4.364518e-02
## smoke=smoke_Yes -0.13128246 7.194873e-03
## premeno=premeno_Yes -0.09249838 3.315107e-03
## fracscore=1 -0.40626672 1.010225e-03
## armassist=armassist_No -0.08697950 7.047091e-04
## fracscore=0 -0.49480779 3.625746e-05
## fracture=fracture_No -0.14295580 5.400633e-07
## priorfrac=priorfrac_No -0.21155159 4.201009e-14
## raterisk=Less -0.30243929 2.386909e-17
## bonemed_fu=bonemed_fu_No -0.57007390 5.422994e-199
## bonemed=bonemed_No -0.58693281 2.443029e-205
## bonetreat=bonetreat_No -0.61370510 7.754159e-225
bonetreat, bonemed, bonemed_fu all have a sensitivity around 78% and specificity between 32% and 42%
# bonetreat as a naive classifier for fracture ('No' is the positive class).
confusionMatrix(table(glow_bonemed$bonetreat, glow_bonemed$fracture))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 297 85
## Yes 78 40
##
## Accuracy : 0.674
## 95% CI : (0.631, 0.715)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.1141
##
## Mcnemar's Test P-Value : 0.6384
##
## Sensitivity : 0.7920
## Specificity : 0.3200
## Pos Pred Value : 0.7775
## Neg Pred Value : 0.3390
## Prevalence : 0.7500
## Detection Rate : 0.5940
## Detection Prevalence : 0.7640
## Balanced Accuracy : 0.5560
##
## 'Positive' Class : No
##
# bonemed as a naive classifier for fracture ('No' is the positive class).
confusionMatrix(table(glow_bonemed$bonemed, glow_bonemed$fracture))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 292 79
## Yes 83 46
##
## Accuracy : 0.676
## 95% CI : (0.633, 0.7169)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.1451
##
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.7787
## Specificity : 0.3680
## Pos Pred Value : 0.7871
## Neg Pred Value : 0.3566
## Prevalence : 0.7500
## Detection Rate : 0.5840
## Detection Prevalence : 0.7420
## Balanced Accuracy : 0.5733
##
## 'Positive' Class : No
##
# bonemed_fu as a naive classifier for fracture ('No' is the positive class).
confusionMatrix(table(glow_bonemed$bonemed_fu, glow_bonemed$fracture))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 289 72
## Yes 86 53
##
## Accuracy : 0.684
## 95% CI : (0.6412, 0.7246)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.9996
##
## Kappa : 0.1877
##
## Mcnemar's Test P-Value : 0.3010
##
## Sensitivity : 0.7707
## Specificity : 0.4240
## Pos Pred Value : 0.8006
## Neg Pred Value : 0.3813
## Prevalence : 0.7500
## Detection Rate : 0.5780
## Detection Prevalence : 0.7220
## Balanced Accuracy : 0.5973
##
## 'Positive' Class : No
##
# MCA restricted to the four medication/fracture variables (columns 15:18).
glow_bonemed$fracscore <- as.factor(glow_bonemed$fracscore)
# Fixed the "blow_" variable-name typo.
glow_bonemed_medication <- glow_bonemed[, c("fracture", "bonemed",
                                            "bonemed_fu", "bonetreat")]
multipleCorrespondenceAnalysis <- MCA(glow_bonemed_medication, graph = FALSE)
# Dropped col.quali.sup: no supplementary variables are declared in this MCA,
# so the argument had no effect.
plot.MCA(multipleCorrespondenceAnalysis,
         cex = 0.8,
         title = "Multiple Correspondence Analysis")
# Heatmaps of (age, bmi) and of all four continuous measures, column-scaled.
# F/T replaced with FALSE/TRUE (T and F are reassignable); the duplicated
# palette expression is hoisted into one variable.
heat_palette <- colorRampPalette(c("blue", "white", "red"), space = "rgb")(100)
pheatmap(glow_bonemed[, c(5, 8)], scale = "column", fontsize_row = 0.1,
         cluster_cols = FALSE, legend = TRUE, color = heat_palette)
pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1,
         cluster_cols = FALSE, legend = TRUE, color = heat_palette)
# Complete-linkage hierarchical clustering of subjects on z-scored variables.
zScoreScale <- scale(glow_bonemed[, 5:8])
zScoreDistance <- dist(zScoreScale)
continuousVariableClustering <- hclust(zScoreDistance, method = "complete")
plot(continuousVariableClustering)
Split the data into a training/testing set
# Seeded 80/20 train/test split.  seq_len(nrow(...)) replaces the
# c(1:dim(x)[1]) anti-pattern; the sampled indices are identical.
set.seed(4)
n_obs <- nrow(glow_bonemed)
trainingIndices <- sample(seq_len(n_obs), n_obs * 0.8)
trainingDataframe <- glow_bonemed[trainingIndices, ]
testingDataframe <- glow_bonemed[-trainingIndices, ]
Age is the only statistically significant continuous variable at the alpha = 0.2 level (p < 0.0001)
# Logistic regression on the continuous variables.  Uses the already-defined
# trainingDataframe instead of re-subsetting glow_bonemed[trainingIndices, ]
# (same rows; keeps the script consistent with the split above).
model <- glm(fracture ~ age + weight + height + bmi,
             data = trainingDataframe, family = "binomial")
summary(model)
##
## Call:
## glm(formula = fracture ~ age + weight + height + bmi, family = "binomial",
## data = glow_bonemed[trainingIndices, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2770 -0.7951 -0.6672 1.2602 1.9710
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.006590 13.891104 -0.432 0.665447
## age 0.045888 0.013481 3.404 0.000664 ***
## weight -0.039155 0.095610 -0.410 0.682156
## height 0.008938 0.085578 0.104 0.916814
## bmi 0.113632 0.247382 0.459 0.645993
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 458.45 on 399 degrees of freedom
## Residual deviance: 442.53 on 395 degrees of freedom
## AIC: 452.53
##
## Number of Fisher Scoring iterations: 4
# AIC of the continuous-variable model, for comparison against later fits.
AIC(model)
## [1] 452.5326
bonetreat 0.87245886 7.754159e-225 bonemed 0.84726742 2.443029e-205 bonemed_fu 0.83803940 5.422994e-199 raterisk 0.19571783 3.118876e-24 fracscore 0.19131230 2.223696e-17 priorfrac 0.10838142 4.201009e-14 fracture 0.04922945 5.400633e-07 armassist 0.02280481 7.047091e-04 premeno 0.01718799 3.315107e-03 smoke 0.01441500 7.194873e-03
# Logistic regression adding the medication variables flagged by the MCA.
# Uses the already-defined trainingDataframe instead of re-subsetting
# glow_bonemed[trainingIndices, ] (same rows).
model <- glm(fracture ~ age + bonetreat + bonemed + bonemed_fu,
             data = trainingDataframe, family = "binomial")
summary(model)
##
## Call:
## glm(formula = fracture ~ age + bonetreat + bonemed + bonemed_fu,
## family = "binomial", data = glow_bonemed[trainingIndices,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4723 -0.7634 -0.6110 0.8596 1.9992
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.00531 0.93027 -4.306 1.67e-05 ***
## age 0.03844 0.01326 2.898 0.00376 **
## bonetreatYes -2.50872 0.88413 -2.838 0.00455 **
## bonemedYes 1.39128 0.70199 1.982 0.04749 *
## bonemed_fuYes 1.71650 0.51300 3.346 0.00082 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 458.45 on 399 degrees of freedom
## Residual deviance: 428.21 on 395 degrees of freedom
## AIC: 438.21
##
## Number of Fisher Scoring iterations: 4
# AIC of the medication model — lower than the continuous-variable model.
AIC(model)
## [1] 438.2137
# trying to figure out how to use sjPlot to mimic what we did in unit12 prelive
#plot_model(model, type = "pred", terms = c("age", "smoke"))
Get eigen vectors and values for principal component analysis
# fracscore was converted to a factor earlier in the script; calling
# as.integer() directly on a factor returns the level CODES (1..12), not the
# original scores (0..11).  Recover the true values via as.character().
# (prcomp with scale. = TRUE is shift-invariant, so the results below are
# unchanged by this fix.)
glow_bonemed$fracscore <- as.integer(as.character(glow_bonemed$fracscore))
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result <- prcomp(glow_bonemed[, corr_vars], scale. = TRUE)
# Eigen vectors (variable loadings)
pc.result$rotation
## PC1 PC2 PC3 PC4 PC5
## age 0.4947219 0.46742140 -0.15246583 0.71654567 -0.009160237
## weight -0.5273035 0.46578775 -0.08840991 0.03240244 -0.704362523
## height -0.2345770 -0.08196149 -0.93823245 0.01885633 0.240042129
## bmi -0.4741030 0.51615173 0.24533677 0.05137380 0.667820563
## fracscore 0.4442985 0.53984137 -0.16872342 -0.69463484 0.013601399
# Eigen values (variance explained by each component)
eigenvals <- pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342
Scree plot
# Scree plots: raw eigenvalues, then proportion of variance explained with
# the cumulative proportion overlaid as a dashed line.
par(mfrow = c(1, 2))
plot(eigenvals, type = "l",
     main = "Scree Plot",
     ylab = "Eigen Values",
     xlab = "PC #")
prop_var <- eigenvals / sum(eigenvals)
plot(prop_var,
     type = "l", main = "Scree Plot",
     ylab = "Prop. Var. Explained",
     xlab = "PC #",
     ylim = c(0, 1))
lines(cumsum(prop_var), lty = 2)
Naive Bayes model with categorical variables
# 10-fold cross-validated naive Bayes on the categorical predictors.
nb_vars <- c("priorfrac", "premeno", "momfrac", "armassist", "smoke",
             "raterisk", "fracture", "bonemed", "bonemed_fu", "bonetreat")
set.seed(1234)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
nb.fit <- train(fracture ~ .,
                data = trainingDataframe[, nb_vars],
                method = "nb",
                trControl = fitControl)
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
nb.fit
## Naive Bayes
##
## 400 samples
## 9 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.6731191 0.163332
## TRUE 0.7400891 0.000000
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
## = 1.
Knn models
# 10-fold cross-validated kNN using age alone, tuned over several k values.
knn_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")
set.seed(1234)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit <- train(fracture ~ age,
                 data = trainingDataframe[, knn_vars],
                 method = "knn",
                 trControl = fitControl,
                 tuneGrid = expand.grid(k = c(1:10, 20, 30)))
knn.fit
## k-Nearest Neighbors
##
## 400 samples
## 1 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7178299 0.04451770
## 2 0.7252048 0.04095402
## 3 0.7302720 0.05828196
## 4 0.7202689 0.03291690
## 5 0.7327720 0.05605753
## 6 0.7328971 0.06388898
## 7 0.7454644 0.08867516
## 8 0.7428361 0.07440775
## 9 0.7454644 0.07173952
## 10 0.7276438 -0.01393495
## 20 0.7299578 0.01388032
## 30 0.7302079 -0.00185982
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
After tinkering it seems the best model only uses the age variable
# ROC curve and AUC for the kNN model on the held-out test set.
knn_test_probs <- predict(knn.fit, testingDataframe, type = "prob")[, "Yes"]
knnfit.roc.valid <- roc(response = testingDataframe$fracture,
                        predictor = knn_test_probs,
                        levels = c("No", "Yes"))
## Setting direction: controls < cases
plot(knnfit.roc.valid, print.thres = "best", col = "lightblue",
     main = "Best threshold for Knn validation data set")
auc(knnfit.roc.valid)
## Area under the curve: 0.6389
# NOTE(review): this chunk is an exact duplicate of the earlier kNN fit
# (same seed, data, and tuning grid), so it reproduces identical results;
# consider removing it.
corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")
set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ age,
data = trainingDataframe[, corr_vars],
method = "knn",
trControl = fitControl,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
knn.fit
## k-Nearest Neighbors
##
## 400 samples
## 1 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7178299 0.04451770
## 2 0.7252048 0.04095402
## 3 0.7302720 0.05828196
## 4 0.7202689 0.03291690
## 5 0.7327720 0.05605753
## 6 0.7328971 0.06388898
## 7 0.7454644 0.08867516
## 8 0.7428361 0.07440775
## 9 0.7454644 0.07173952
## 10 0.7276438 -0.01393495
## 20 0.7299578 0.01388032
## 30 0.7302079 -0.00185982
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Random forest on the categorical predictors plus age.
# Fixed typo: `imprtance` was silently absorbed by `...`, so variable
# importance was never computed; the argument is `importance`.
# NOTE(review): no seed is set before this fit, so results vary run-to-run —
# consider set.seed() here for reproducibility.
forestModel <- randomForest(fracture ~ priorfrac + age + premeno + momfrac +
                              armassist + smoke + fracscore + bonemed +
                              bonemed_fu,
                            data = trainingDataframe,
                            ntree = 500,
                            mtry = 3,
                            importance = TRUE)
forestPredictions <- predict(forestModel, testingDataframe)
confusionMatrix(forestPredictions, testingDataframe$fracture)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 15
## Yes 5 6
##
## Accuracy : 0.8
## 95% CI : (0.7082, 0.8733)
## No Information Rate : 0.79
## P-Value [Acc > NIR] : 0.46055
##
## Kappa : 0.2695
##
## Mcnemar's Test P-Value : 0.04417
##
## Sensitivity : 0.9367
## Specificity : 0.2857
## Pos Pred Value : 0.8315
## Neg Pred Value : 0.5455
## Prevalence : 0.7900
## Detection Rate : 0.7400
## Detection Prevalence : 0.8900
## Balanced Accuracy : 0.6112
##
## 'Positive' Class : No
##
# ROC curve and AUC for the random forest on the held-out test set.
forest_test_probs <- predict(forestModel, testingDataframe, type = "prob")[, "Yes"]
forestfit.roc.valid <- roc(response = testingDataframe$fracture,
                           predictor = forest_test_probs,
                           levels = c("No", "Yes"))
## Setting direction: controls < cases
plot(forestfit.roc.valid, print.thres = "best", col = "black",
     main = "Best threshold for random forest validation data set")
auc(forestfit.roc.valid)
# Log-loss-based resampling control for subsequent caret fits.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 1,
                           classProbs = TRUE, summaryFunction = mnLogLoss)
set.seed(4)
# Drop the site identifier columns before modelling.
trainingDataframe <- trainingDataframe[, !(names(trainingDataframe) %in% c("site_id_factor", "site_id"))]
testingDataframe <- testingDataframe[, !(names(testingDataframe) %in% c("site_id_factor", "site_id"))]
# Find the numeric column with the smallest variance.  apply(df, 2, var)
# coerces the mixed data frame to a character matrix (the source of the
# repeated "NAs introduced by coercion" warnings); compute variance over
# numeric columns only instead.
numeric_cols <- vapply(trainingDataframe, is.numeric, logical(1))
names(which.min(vapply(trainingDataframe[numeric_cols], var, numeric(1))))
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## Warning in stats::var(...): NAs introduced by coercion
## [1] "fracscore"
{r} # #Version 1 # lda.fit = caret::train(fracture ~ . - sub_id - phy_id + priorfrac:fracscore + age:fracscore + fracscore:bonetreat, # data = trainingDataframe, # method = "lda", # trControl = fitControl, # preProc = c("center", "scale"), # metric = "logLoss") # # lda.fit # logLoss = 0.5702 # # #Computing predicted probabilities on the testing data # ldafit.predprobs.valid = predict(lda.fit, testingDataframe, type = "prob")[,"Yes"] # # # ldafit.roc.valid = roc(response = testingDataframe$fracture, predictor = ldafit.predprobs.valid, levels = c("No", "Yes")) #{r} # complex1 = glm(fracture ~ age + bonetreat + fracscore + priorfrac + bonemed + bonemed_fu + priorfrac:fracscore + age:fracscore + fracscore:bonetreat, data = trainingDataframe, family = "binomial") # # # Get Predictions # #Complex model from previous # complex1.predprobs<-predict(complex1,trainingDataframe ,type="response") # # # complex model ROC # complex1.roc = roc(response=trainingDataframe$fracture,predictor=complex1.predprobs,levels=c("No","Yes")) # # validateComplexPred <- predict(complex1, newdata = testingDataframe, type="response") # # # complex model ROC # complex1.roc.Valid <- roc(response=testingDataframe$fracture,predictor=validateComplexPred,levels=c("No","Yes")) #{r} # # Plot complex, lda models, and random forest # # #plot(complex1.roc.Valid,col="red") # # #plot(complex1.roc.Valid,print.thres="best",col="red") # use to print best threshold - gets messy with everything else # # #plot(ldafit.roc.valid, col="lightblue", add = T) # # # plot(ldafit.roc, col="lightblue", add = T, legend = T, print.thres="best") # use to print best threshold # # # #INSERT KNN MODEL ROC HERE SKELETON CODE MOST LIKELY: # plot.new() # plot(0,0, xlim=c(0,1), ylim=c(0,1), type="n", main="ROC curve") # # plot(knnfit.roc.valid, add = T, col = "green") # plot(complex1.roc.Valid,col="red") # plot(forestfit.roc.valid, add = T, col = "black", legend = T) # # # plot(rf.result.roc,print.thres="best", add = T, col = "black", 
legend = T) # # # # use to print best threshold # legend("bottomright", # legend=c("Complex", "LDA", "KNN", "Random Forest"), # col=c("red", "lightblue", "green", "black"), # lwd=4, cex =1, xpd = TRUE, horiz = FALSE) ##Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013)
Applied Logistic Regression, 3rd ed., New York: Wiley
https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90